From c28478383c1a628d42866803d785e1f23baa33be Mon Sep 17 00:00:00 2001
From: Debarshi Ray <debarshir@gnome.org>
Date: Thu, 21 Dec 2017 10:14:53 +0100
Subject: [PATCH] CIE: Use a faster cbrtf implementation

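This is the approximate cube root implementation for IEEE floats from
Hacker's Delight. The elimination of all conditional branches probably
makes it a better candidate for future SIMD accelerated code paths.
Note that, unlike the FreeBSD-derived code it replaces, it has no
special handling for zero, subnormal, negative, infinite or NaN inputs.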

On an Intel i7 Haswell, it now takes 0.27s to convert a 15 megapixel
buffer from "RGBA float" to "CIE Lab alpha float" instead of the
earlier 0.35s. A "Y float" to "CIE L float" conversion takes 0.085s
instead of 0.102s.

Original code: http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt
Permissions: http://www.hackersdelight.org/permissions.htm

https://bugzilla.gnome.org/show_bug.cgi?id=791837
---
 extensions/CIE.c | 61 +++++++++++-------------------------------------
 1 file changed, 14 insertions(+), 47 deletions(-)

diff --git a/extensions/CIE.c b/extensions/CIE.c
index bd9e836..b6fa513 100644
--- a/extensions/CIE.c
+++ b/extensions/CIE.c
@@ -565,61 +565,28 @@ lchaba_to_rgba (const Babl *conversion,char *src,
 
 /******** begin floating point RGB/CIE color space conversions ********/
 
-/* origin: FreeBSD /usr/src/lib/msun/src/s_cbrtf.c */
-/*
- * Conversion to float by Ian Lance Taylor, Cygnus Support, ian@cygnus.com.
- * Debugged and optimized by Bruce D. Evans.
- */
-/*
- * ====================================================
- * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
- *
- * Developed at SunPro, a Sun Microsystems, Inc. business.
- * Permission to use, copy, modify, and distribute this
- * software is freely granted, provided that this notice
- * is preserved.
- * ====================================================
+/* origin: http://www.hackersdelight.org/hdcodetxt/acbrt.c.txt
+ * permissions: http://www.hackersdelight.org/permissions.htm
  */
 /* _cbrtf(x)
  * Return cube root of x
  */
 
-#include <math.h>
 #include <stdint.h>
 
-static const unsigned
-B1 = 709958130, /* B1 = (127-127.0/3-0.03306235651)*2**23 */
-B2 = 642849266; /* B2 = (127-127.0/3-24/3-0.03306235651)*2**23 */
-
-static inline float _cbrtf(float x)
+static inline float
+_cbrtf (float x)
 {
-	float r,T;
-	union {float f; uint32_t i;} u = {x};
-	uint32_t hx = u.i & 0x7fffffff;
-
-	if (hx >= 0x7f800000)  /* cbrt(NaN,INF) is itself */
-		return x + x;
-
-	/* rough cbrt to 5 bits */
-	if (hx < 0x00800000) {  /* zero or subnormal? */
-		if (hx == 0)
-			return x;  /* cbrt(+-0) is itself */
-		u.f = x*0x1p24f;
-		hx = u.i & 0x7fffffff;
-		hx = hx/3 + B2;
-	} else
-		hx = hx/3 + B1;
-	u.i &= 0x80000000;
-	u.i |= hx;
-
-	T = u.f;
-	r = T*T*T;
-	T = T*((float)x+x+r)/(x+r+r);
-
-	r = T*T*T;
-	T = T*((float)x+x+r)/(x+r+r);
-
-	return T;
+  union { float f; uint32_t i; } u = { x };
+
+  u.i = u.i / 4 + u.i / 16;
+  u.i = u.i + u.i / 16;
+  u.i = u.i + u.i / 256;
+  u.i = 0x2a5137a0 + u.i;
+  u.f = 0.33333333f * (2.0f * u.f + x / (u.f * u.f));
+  u.f = 0.33333333f * (2.0f * u.f + x / (u.f * u.f));
+
+  return u.f;
 }
 
 static inline float
-- 
2.30.2